# ------------------------------
# ' check the distribution of categorical measures across different counties
# ------------------------------

# custom function to measure SD for weighted sd
weighted.sd <- function(x, w = NULL, na.rm = FALSE) {
	if (na.rm) {
		na <- is.na(x) | is.na(w)
		x <- x[!na]
		w <- w[!na]
	}
	weighted_sd = sqrt(sum(w * (x - weighted.mean(x, w)) ^ 2) / (sum(w) - 1))
	return(weighted_sd)
}

# load processed data : 
rdata = read_fst(file.path(processed_path,paste0('merged_','county','_v1.fst')), as.data.table=TRUE)

#------------------------------------------------------------------------------
# Data Preprocessing
rdata[, Race5 := Race4]
rdata[Hisp == 1, Race5 := 5]

# drop other race categories
rdata[Race5 == 9, Race5 := NA] 

rdata[, MarStat5 := MarStat6]
rdata[MarStat5 == 6, MarStat5 := 5]

# drop younger than 15
rdata = rdata[AgeGrp4 != 0,]

rdata[, Suic := DSID]
rdata[, St := state]

# drop when state indicator is missing
data = rdata[!is.na(St),]
	
data$ObsWgt0 = data[[paste0('county','_weight')]]		
	
#==============================================================================
list_var = c('Sex','AgeGrp4','Race5','MarStat5','BornUSA', 'UnEmpl', 'PhysProb')

list_target_category = list()
list_target_category[[1]] = list(name='Sex',value=c(1,2),value_name=c('Male','Female'))
list_target_category[[2]] = list(name='AgeGrp4',value=c(1,2,3,4),value_name=c('15-24','25-44','45-64','65+'))
list_target_category[[3]] = list(name='Race5',value=c(1,2,3,4,5),value_name=c('White','Black','Indian|Alaska','Asian','Hispanic'))
list_target_category[[4]] = list(name='MarStat5',value=c(1,2,3,4,5),value_name=c('Married','Widowed','Divorced','Separated','Never Married'))
list_target_category[[5]] = list(name='BornUSA',value=c(0,1),value_name=c('not US-born','US-born'))
list_target_category[[6]] = list(name='UnEmpl',value=c(0,1),value_name=c('Employed','Unemployed'))
list_target_category[[7]] = list(name='PhysProb',value=c(0,1),value_name=c('Healthy','Physical Problem'))

find_category = function(target){
	i = which(unlist(lapply(list_target_category,function(x) x$name)) %in% target)
	value = list_target_category[[i]]$value 
	value_name = list_target_category[[i]]$value_name
	return(list(value=value,value_name=value_name))
}

for (var in list_var){

		prop_rdata = data[, .(N=sum(ObsWgt0)), by = c('county', var)][, prop := 100 * N / sum(N), by = c('county')]
		prop_rdata = na.omit(prop_rdata)
	
		# add standardized variables by each category
		prop_rdata[,mean_prop := mean(prop), by=var] # overall mean
		prop_rdata[,sd_prop := sd(prop), by=var] # overall sd 
		prop_rdata[,std_prop := (prop - mean_prop) / sd_prop]
		prop_rdata[, `:=`(N = NULL, mean_prop = NULL, sd_prop = NULL)]
		prop_rdata = melt(prop_rdata, id.vars=c('county',var))
		
		prop_rdata[, type := factor(variable, 
			levels=c('prop','std_prop'),
			labels=c('Raw Percentage','Standardized %'))]

		prop_rdata[[var]] = factor(prop_rdata[[var]], 
			levels=find_category(var)$value,
			labels=find_category(var)$value_name)
		n_category = length(find_category(var)$value)

		lsize = 8 
		legend_name = gsub('\\d','',var)
		
		if (legend_name == 'Female') legend_name <- 'Sex'
		
		if (legend_name == 'MarStat') {
			legend_name <- 'Marital Status'
			lsize = 8
		}
		
		if (grepl('UnEmpl',legend_name)) legend_name <- 'Employment Status'
		if (grepl('PhysProb',legend_name)) legend_name <- 'Physical Problem'
		if (legend_name == 'BornUSA') legend_name <- 'USA Born'
		if (legend_name == 'AgeGrp') legend_name <- 'Age Group'

		setnames(prop_rdata,var, 'category')
		p = ggplot(prop_rdata, aes(x=category,y=value,fill=category)) +
			geom_violin(trim=FALSE) +
			stat_summary(fun.y=mean, geom="point", color="black",size=2, shape=3)+
			facet_wrap(~type, scale='free')+
			coord_flip()+
			theme_pubr()+
		 	theme(legend.position = "top",
		 		legend.text=element_text(size=lsize),legend.title=element_blank()) +
			labs(title = NULL,
				x='', y="The distribution of each category at county levels (% or SD units)") + 
			guides(color=guide_legend(title=legend_name),
			fill=guide_legend(title=legend_name),
			shape=guide_legend(title=legend_name)) 
		file_name = paste0('category_distribution_',var,'.png')
		ggsave(file=file.path(figure_path, file_name),plot=p, width=7,height=4)
}







